# download_net_issue_live.py
# NET (Nuclear Energy and Technology) Downloader
# -------------------------------------------------
# Automates downloading PDFs from NET issue pages on the Pensoft platform.
# - Accepts a live issue URL as input.
# - Parses Volume, Issue, and Year from the issue title header, falling back to <div class="issueNo">.
# - Retrieves article titles and PDF links from structured HTML blocks.
# - Resolves relative PDF URLs using urljoin to ensure proper HTTPS links.
# - Creates output folder in the format: NET_Vol{vol}_Issue{iss}_{year}.
# - Saves PDFs using sanitized titles (Windows-safe filenames).
# - Logs all downloads (title, article URL, PDF URL, filename, status) to a CSV file.
# - Compatible with all NET issues and adaptable to other Pensoft journals.

"""
Downloader for Nuclear Energy and Technology (NET, Pensoft) issues via LIVE URL.

• Input: Issue URL (e.g., https://nucet.pensoft.net/issue/3405/) via CLI arg or prompt
• Output folder: NET_Vol{vol}_Issue{iss}_{year}, created in the directory that contains this script
• Log CSV inside the output folder

Notes
-----
- Robustly parses volume/issue/year from the header (e.g., "Nuclear Energy and Technology 6(1) (2020)") or sidebar.
- Finds each article block (Pensoft layout) and extracts title + PDF link.
- Resolves relative URLs using <base href> or the issue URL's origin.
- Skips entries without a PDF link.

Usage
-----
python download_net_issue_live.py "https://nucet.pensoft.net/issue/3405/"
# Or just run without args and paste the URL when prompted.
"""


import os
import re
import csv
import sys
import time
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


# ---------- Config ----------
# Output folders are created inside the directory that contains this script
# (not the current working directory).
FIXED_ROOT = Path(__file__).resolve().parent  # Output directly in script directory

# Headers sent with every HTTP request; presents a desktop-browser User-Agent.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
}
# Passed as `timeout=` to requests.get (seconds).
TIMEOUT = 60
# Seconds passed to time.sleep after a failed request attempt.
RETRY_SLEEP = 2
# Used as range(MAX_RETRIES), i.e. the TOTAL number of attempts per request.
MAX_RETRIES = 3


def sanitize_filename(name: str) -> str:
    """Turn an article title into a filename stem that is safe on Windows.

    The title is cleaned in this order: characters Windows forbids in file
    names (and non-whitespace control characters) are removed, whitespace
    runs are collapsed to a single space, dot runs are collapsed to a single
    dot, the result is capped at 180 characters, and leading/trailing dots
    and spaces are trimmed.

    Args:
        name: Raw title text.

    Returns:
        The cleaned stem (without extension), or ``"untitled"`` when nothing
        usable remains, so callers never end up with a bare ``.pdf`` file.
    """
    # Whitespace-like control characters (\t, \n, ...) are deliberately left
    # for the next step so that they become a single separating space.
    name = re.sub(r"[\\/*?:\"<>|\x00-\x08\x0e-\x1b\x7f]", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    name = re.sub(r"\.+", ".", name).strip(". ")
    # Cutting at 180 characters can expose a trailing space or dot, which
    # Windows does not accept at the end of a file name.
    name = name[:180].rstrip(". ")
    return name or "untitled"


def get_soup_from_url(url: str) -> BeautifulSoup:
    """Fetch *url* and return its body parsed with ``html.parser``.

    The request is attempted up to ``MAX_RETRIES`` times in total, pausing
    ``RETRY_SLEEP`` seconds between attempts (but not after the last one).

    Args:
        url: Address of the page to fetch.

    Returns:
        The parsed document.

    Raises:
        requests.RequestException: The error from the final attempt
            (connection failure, timeout, or HTTP error status).
        RuntimeError: If ``MAX_RETRIES`` is less than 1, so no attempt ran.
    """
    last_exc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
        except requests.RequestException as e:
            last_exc = e
            # Waiting is only useful when another attempt will follow.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)
        else:
            return BeautifulSoup(r.text, "html.parser")
    if last_exc is None:
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def detect_base_url(soup: BeautifulSoup, issue_url: str) -> str:
    """Return the absolute URL against which the page's links are resolved.

    If the document has a ``<base>`` tag with a non-blank ``href``, that
    value is used, itself resolved against *issue_url* so that a relative
    base such as ``/`` or ``//host/`` still yields an absolute URL.
    Otherwise the origin (``scheme://host/``) of *issue_url* is returned.

    Args:
        soup: Parsed issue page.
        issue_url: URL the page was fetched from.

    Returns:
        An absolute base URL suitable as the first argument of ``urljoin``.
    """
    base = soup.find("base")
    href = (base.get("href") or "").strip() if base is not None else ""
    if href:
        return urljoin(issue_url, href)
    p = urlparse(issue_url)
    return f"{p.scheme}://{p.netloc}/"


def parse_issue_meta(soup: BeautifulSoup):
    """Extract ``(volume, issue, year)`` strings from a parsed issue page.

    The title header (``.issue_title_identifier .zag``) is tried first and is
    expected to look like ``"... 6(1) (2020)"``. When it does not match, the
    volume and issue are read from the ``.issueNo`` element
    (``"Issue 6(1)"``) and the year is the first 19xx/20xx number found in
    the text of the whole page.

    Returns:
        A 3-tuple of strings. Any part that cannot be determined is replaced
        by the placeholder ``"Vol"``, ``"Issue"`` or ``"Year"``.
    """
    title_node = soup.select_one(".issue_title_identifier .zag")
    if title_node:
        title_text = title_node.get_text(" ", strip=True)
    else:
        title_text = ""

    header_match = re.search(r"\b(\d+)\s*\(\s*(\d+)\s*\)\s*\(?((?:19|20)\d{2})?\)?", title_text)
    if header_match:
        volume, issue, header_year = header_match.groups()
        # The year group is optional in the pattern, hence the placeholder.
        return volume, issue, header_year or "Year"

    # Fallback path: the header was missing or had an unexpected shape.
    sidebar_node = soup.select_one(".issueNo")
    sidebar_text = sidebar_node.get_text(strip=True) if sidebar_node else ""
    sidebar_match = re.search(r"Issue\s+(\d+)\((\d+)\)", sidebar_text)
    if sidebar_match:
        volume, issue = sidebar_match.groups()
    else:
        volume, issue = "Vol", "Issue"

    page_year = re.search(r"\b(19|20)\d{2}\b", soup.get_text(" "))
    return volume, issue, (page_year.group(0) if page_year else "Year")


def collect_articles(soup: BeautifulSoup):
    """Collect the articles on an issue page that offer a PDF link.

    Every candidate article container is inspected for a title link pointing
    at ``/article/...`` and for a PDF link, recognised by a ``title`` or link
    text equal to "pdf" (case-insensitive) or by ``/download/pdf/`` in its
    ``href``. Containers lacking either link are skipped.

    Returns:
        A list of ``(title, article_href, pdf_href)`` tuples in page order,
        with hrefs exactly as they appear in the markup. Entries whose
        (article_href, pdf_href) pair was already seen are dropped.
    """
    # Keyed by the stripped href pair; dict insertion order keeps page order
    # and setdefault keeps the first occurrence of each pair.
    by_links = {}

    for block in soup.select("div.ArtBrowseInfo, div.article, div[class*='ArtBrowseInfo']"):
        headline = (
            block.select_one(".articleHeadline a[href*='/article/']")
            or block.select_one("a[href^='/article/']")
        )
        if not headline:
            continue
        title = headline.get_text(" ", strip=True)

        pdf_link = next(
            (
                link
                for link in block.select("a[href]")
                if (link.get("title") or "").strip().lower() == "pdf"
                or link.get_text(strip=True).lower() == "pdf"
                or "/download/pdf/" in link.get("href", "")
            ),
            None,
        )
        if not pdf_link:
            continue

        article_href = headline.get("href", "")
        pdf_href = pdf_link.get("href", "")
        by_links.setdefault(
            (article_href.strip(), pdf_href.strip()),
            (title, article_href, pdf_href),
        )

    return list(by_links.values())


def ensure_pdf_response(resp: requests.Response) -> bool:
    """Tell whether an HTTP response appears to carry a PDF document.

    The response is accepted when its ``Content-Type`` header mentions "pdf"
    (case-insensitive). Otherwise the body must begin with the ``%PDF-``
    signature.

    Args:
        resp: Completed response whose headers and body can be read.

    Returns:
        True if either check succeeds, False otherwise.
    """
    declared_type = resp.headers.get("Content-Type") or ""
    # The body is inspected only when the header does not already say "pdf".
    return "pdf" in declared_type.lower() or resp.content[:5] == b"%PDF-"


def download_file(url: str, dest: Path) -> None:
    """Download *url* to *dest*, accepting only responses that look like PDFs.

    The request is attempted up to ``MAX_RETRIES`` times in total, pausing
    ``RETRY_SLEEP`` seconds between attempts (but not after the last one).
    The body is first written to a sibling ``<name>.part`` file and then
    moved into place, so an interrupted write never leaves a truncated file
    at *dest* that a later run would mistake for a finished download.

    Args:
        url: Absolute URL of the PDF.
        dest: Path of the file to create or overwrite.

    Raises:
        requests.RequestException: The final attempt failed at the network or
            HTTP level.
        ValueError: The final response did not look like a PDF.
        OSError: The file could not be written (not retried).
        RuntimeError: If ``MAX_RETRIES`` is less than 1, so no attempt ran.
    """
    dest = Path(dest)
    tmp = dest.with_name(dest.name + ".part")
    last_exc = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            if not ensure_pdf_response(r):
                raise ValueError(f"Non-PDF response (Content-Type={r.headers.get('Content-Type')})")
        except (requests.RequestException, ValueError) as e:
            last_exc = e
            # Waiting is only useful when another attempt will follow.
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_SLEEP)
            continue

        try:
            with open(tmp, "wb") as f:
                f.write(r.content)
            tmp.replace(dest)
        finally:
            # Still present only if the write or the move failed.
            if tmp.exists():
                tmp.unlink()
        return
    if last_exc is None:
        raise RuntimeError("MAX_RETRIES must be at least 1")
    raise last_exc


def main():
    """Download every PDF of one NET issue and write a CSV log of the run.

    The issue URL is taken from the first command-line argument, or asked
    for interactively when no argument is given. PDFs and the log are stored
    in ``FIXED_ROOT / NET_Vol{vol}_Issue{iss}_{year}``. Files that already
    exist with a non-zero size are logged as "Exists" and not fetched again.

    Exits with status 1 when no URL is supplied or the issue page cannot be
    fetched. Failures of individual downloads are logged and do not stop
    the run.
    """
    if len(sys.argv) >= 2:
        issue_url = sys.argv[1].strip()
    else:
        issue_url = input("Paste NET issue URL (e.g., https://nucet.pensoft.net/issue/3405/): ").strip()

    if not issue_url:
        print("ERROR: No URL provided.")
        sys.exit(1)

    print(f"[INFO] Fetching issue page: {issue_url}")
    try:
        soup = get_soup_from_url(issue_url)
    except Exception as e:
        # Top-level boundary: report the problem instead of a raw traceback.
        print(f"ERROR: Could not fetch issue page: {e}")
        sys.exit(1)

    base_url = detect_base_url(soup, issue_url)
    vol, iss, year = parse_issue_meta(soup)

    out_folder = FIXED_ROOT / f"NET_Vol{vol}_Issue{iss}_{year}"
    out_folder.mkdir(parents=True, exist_ok=True)

    log_path = out_folder / f"NET_Vol{vol}_Issue{iss}_{year}_log.csv"
    with open(log_path, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Title", "Article URL", "PDF URL", "Filename", "Status"])

        articles = collect_articles(soup)
        print(f"[INFO] Found {len(articles)} candidate items with PDFs")

        saved = 0
        # File stems already assigned in this run, compared case-insensitively
        # because Windows file systems do not distinguish case.
        used_stems = set()
        for idx, (title, art_rel, pdf_rel) in enumerate(articles, 1):
            art_url = urljoin(base_url, art_rel)
            pdf_url = urljoin(base_url, pdf_rel)

            # A title made only of forbidden characters sanitizes to nothing.
            base_stem = sanitize_filename(title) or f"article_{idx}"
            # Articles sharing a title (e.g. "Editorial") would otherwise map
            # to one file and the later ones would be skipped as "Exists".
            # The numbering follows page order, so re-runs pick the same names.
            safe_title = base_stem
            suffix = 2
            while safe_title.casefold() in used_stems:
                safe_title = f"{base_stem} ({suffix})"
                suffix += 1
            used_stems.add(safe_title.casefold())
            outfile = out_folder / f"{safe_title}.pdf"

            if outfile.exists() and outfile.stat().st_size > 0:
                writer.writerow([title, art_url, pdf_url, outfile.name, "Exists"])
                print(f"[{idx}] ✅ Exists: {outfile.name}")
                continue

            print(f"[{idx}] Downloading: {safe_title}")
            try:
                download_file(pdf_url, outfile)
            except Exception as e:
                writer.writerow([title, art_url, pdf_url, outfile.name, f"Error: {e}"])
                print(f"    ❌ Error: {e}")
            else:
                writer.writerow([title, art_url, pdf_url, outfile.name, "OK"])
                print(f"    ✅ Saved: {outfile.name}")
                saved += 1

    print(f"\nDone! {saved} PDFs saved in {out_folder}")
    print(f"Log: {log_path}")


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
